LAST UPDATE AT
[1] "Fri Aug 14 10:33:33 2020"
Set working directory and load packages
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = '/Users/jann/Desktop/Dependencies/')
getwd()
[1] "/Users/jann/Desktop"
library(stringr)
library(edgeR)
library(statmod)
library(ggplot2)
library(tidyverse)
── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ tibble 3.0.3 ✓ purrr 0.3.4
✓ tidyr 1.1.0 ✓ forcats 0.5.0
✓ readr 1.3.1
Warning: package 'tibble' was built under R version 3.6.2
Warning: package 'tidyr' was built under R version 3.6.2
Warning: package 'purrr' was built under R version 3.6.2
── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x readr::col_factor() masks scales::col_factor()
x purrr::discard() masks scales::discard()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(dplyr)
require(zoo)
library(pheatmap)
library(scales)
library(LSD)
Read in dependency files
#read in annotations 1
fullAnno = read.table("Full_Annotations_table.txt", header=T, sep="\t")
#read in annotations 2
full_features300 = read.table("Full_Features300.txt", header=T, sep="\t")
#read in ORF to Gene conversion file
ORF_to_Gene <- read.delim("ORF_to_Gene_conversion_yeastract.csv", header = T, sep=",")
#read in annotations of essential genes:
essentialAnnotations = read.delim("inviable_annotations.txt", sep = "\t", header = T, skip = 7)
essentialORFs = essentialAnnotations$Gene.Systematic.Name[essentialAnnotations$Strain.Background=="S288C" & essentialAnnotations$Mutant.Information=="null"]
Read in demultiplexed read count files of screens, e.g. cellular fitness at 30°C:
Rep12TF_30degC = read.delim("HS_Screen2_perfectMatch_ReadCounts_TFs.txt")
Rep12TF_30degC <- Rep12TF_30degC[,17:20]
colnames(Rep12TF_30degC) <- sub("11_", "_Rep1_", colnames(Rep12TF_30degC))
colnames(Rep12TF_30degC) <- sub("22_", "_Rep2_", colnames(Rep12TF_30degC))
pre_names <- lapply( strsplit(colnames(Rep12TF_30degC), "_"), function(x){unlist(x)[c(1, 4, 2)]})
colnames(Rep12TF_30degC) <- paste( c(lapply(pre_names, function(z){str_c(z, collapse = "_")})) )
Rep12Kin_30degC = read.delim("HS_Screen2_perfectMatch_ReadCounts_Kinases.txt")
Rep12Kin_30degC <- Rep12Kin_30degC[,13:16]
colnames(Rep12Kin_30degC) <- sub("11_", "_Rep1_", colnames(Rep12Kin_30degC))
colnames(Rep12Kin_30degC) <- sub("22_", "_Rep2_", colnames(Rep12Kin_30degC))
pre_names <- lapply( strsplit(colnames(Rep12Kin_30degC), "_"), function(x){unlist(x)[c(1, 4, 2)]})
colnames(Rep12Kin_30degC) <- paste( c(lapply(pre_names, function(z){str_c(z, collapse = "_")})) )
Filtering low counts based on MinusATc samples (these are unperturbed).
Generate annotation table and define edgeR pipeline function to compute guide log2FCs:
#Generate ann table for TF and Kin libraries:
tblGenerate <- function(ReadCounts){
annTable = t(sapply(strsplit(colnames(ReadCounts),"_"),unlist))
annTable = data.frame(annTable)
colnames(annTable) = c("temperature","atc","replicate")
annTable$grp = paste(annTable$temperature, annTable$atc, sep="_")
annTable$grp <- as.factor(annTable$grp)
annTable$grp <- relevel(annTable$grp, ref="Thirty_MinusATc")
annTable
}
ann_TF <- tblGenerate(ReadCounts = Counts_TF)
ann_Kin <- tblGenerate(ReadCounts = Counts_Kin)
#edgeR pipeline to compute FDRs on guide level:
fitGuideModel <- function(counts, annTable){
Group <- annTable$grp
y = DGEList(counts, group=Group)
y = calcNormFactors(y)
plotMDS(y)
#define model
design <- model.matrix(~0+grp+replicate, data=annTable)
yDisp <- estimateDisp(y, design, robust = T)
#control BCV:
plotBCV(yDisp)
#plotSmear(yDisp)
fit = glmFit(yDisp, design, robust=T)
#test for ATc effect at 30degC
lrt = glmLRT(fit, contrast = c(-1, 1, 0))
tab_ThirtyFC = topTags((lrt),n=nrow(yDisp))$table
#inspect log2FCs
hist(tab_ThirtyFC$logFC, breaks=50)
#inspect p-values
hist(tab_ThirtyFC$PValue, breaks=50)
#inspect FDRs
#hist(tab_ThirtyFC$FDR, breaks=50)
#inspect effects
plotMD(lrt)
abline(h=c(-1,1), col="blue")
tab_ThirtyFC$rname = rownames(tab_ThirtyFC)
tab_ThirtyFC = tab_ThirtyFC %>% mutate(screen = "ThirtyFC")
#merge annotations1
fullAnno$mergeCol <- sapply(fullAnno$mergeCol, function(u){ sub("late", "", u)})
#Merge annotations2 with Gene_ORF_conversion file
#Choosing 150bp distance cutoff to determine transcriptional start sites that are potentially regulated by a gRNA:
full_features150 <- full_features300[abs(full_features300$Midpoint_TSS_dist)<=150,]
tabSelection = list("tab_ThirtyFC"=tab_ThirtyFC)
allTabsList <- lapply(tabSelection, function(i){
i$mergeCol <- sapply(i$rname, function(u){ paste( unlist(strsplit(u, ":"))[c(2, 3, 4) ], collapse = ":") })
i$mergeCol <- sapply(i$mergeCol, function(u){ paste( unlist(strsplit(u, "late-"))[1]) })
i$mergeCol <- sapply(i$mergeCol, function(u){ sub("-noMode-noGuideCtr", "", u) })
#annotate aimed strand and genomic positions
mergedDF <- merge(i, fullAnno, by="mergeCol")
##Guide RNA z-score calculation using mean and sd
mergedDF$z_logFC <- ( mergedDF$logFC - mean(mergedDF$logFC) )/sd(mergedDF$logFC)
full_features300$Nearest_TSS_ORF <- as.character(full_features300$Nearest_TSS_ORF)
full_features300$Seq <- as.character(full_features300$Seq)
mergedDF$Seq <- as.character(mergedDF$Seq)
#annotate target genes:
pre_mergedDF_designedTargets <- merge(mergedDF, full_features300[as.character(full_features300$Nearest_TSS_ORF) %in% as.character(mergedDF$ORF), ],
by="Seq", all.x = T)
mergedDF_designedGeneTargets <- pre_mergedDF_designedTargets[!is.na(pre_mergedDF_designedTargets$Seq), ]
#identify genes that are not designed targets but can potentially be regulated by guide RNAs, using 150bp distance threshold:
full_features150 = full_features300[abs(full_features300$Midpoint_TSS_dist) <=150,]
mergedDF_otherPotentialTargets <- merge(mergedDF, full_features150[!(as.character(full_features150$Nearest_TSS_ORF) %in% as.character(mergedDF_designedGeneTargets$ORF)), ],
by="Seq", all.x = F, all.y = F)
mergedDF_otherPotentialTargets = mergedDF_otherPotentialTargets[!mergedDF_otherPotentialTargets$Nearest_TSS_ORF %in% mergedDF_designedGeneTargets$Nearest_TSS_ORF,]
#potential ORF targets in 150bp distance to the gRNA which are not already included in the designed target ORFs
#annotate target
mergedDF_designedGeneTargets$designedTarget = "designed_target"
mergedDF_otherPotentialTargets$designedTarget = "potential_target"
#combine designed and potential targets:
mergedDF_full = rbind(mergedDF_designedGeneTargets, mergedDF_otherPotentialTargets)
#annotate gRNAs that target essential target genes
mergedDF_full$Nearest_ORF_essential = mergedDF_full$Nearest_TSS_ORF %in% as.character(essentialORFs)
#add gene names if available. Otherwise keep ORF name
knownORFs <- as.character(ORF_to_Gene$ORF_yeastractAnnotation[!ORF_to_Gene$ORF_yeastractAnnotation=="Unknown"])
mergedDF_full$Nearest_Gene <- as.character(mergedDF_full$Nearest_TSS_ORF)
for (k in as.character(mergedDF_full$Nearest_TSS_ORF[mergedDF_full$Nearest_TSS_ORF %in% knownORFs])){
mergedDF_full$Nearest_Gene[mergedDF_full$Nearest_TSS_ORF == k] = as.character(ORF_to_Gene$Gene_yeastractAnnotation[ORF_to_Gene$ORF_yeastractAnnotation == k])
}
#annotate gRNAs with potential other target genes that the designed target gene within a 150bp distance to the guide:
guideRNA_TargetNumber = data.frame(table(as.character(mergedDF_full$Seq)))
colnames(guideRNA_TargetNumber) = c("Seq", "guideRNA_TargetNumber")
mergedDF_full <- merge(mergedDF_full, guideRNA_TargetNumber, by="Seq")
#add columns to specify ORF and Gene names of gRNAs with multiple targets
for (i in mergedDF_full$Seq){
mergedDF_full$TargetORFs150bpDistance[mergedDF_full$Seq==i] <- paste0( sort( mergedDF_full$Nearest_TSS_ORF[mergedDF_full$Seq==i]), collapse = "|" )
mergedDF_full$targetGenes150bpDistance[mergedDF_full$Seq==i] <- paste0( sort( mergedDF_full$Nearest_Gene[mergedDF_full$Seq==i]), collapse = "|" )
#Add number of potentially regulated genes (TSS in 150bp distance):
mergedDF_full$targetORFs150bpDistanceNumber[mergedDF_full$Seq==i] <- length(mergedDF_full$Nearest_TSS_ORF[mergedDF_full$Seq==i])
#indicate which TSS is the closest to the gRNA midpoint
mergedDF_full$Nearest_abs_TSS_distance[mergedDF_full$Seq==i] = min( abs(mergedDF_full$Midpoint_TSS_dist[mergedDF_full$Seq==i] ) )
mergedDF_full$Nearest_Guide_to_TSS[mergedDF_full$Seq==i] <- mergedDF_full$Nearest_abs_TSS_distance[mergedDF_full$Seq==i] == abs(mergedDF_full$Midpoint_TSS_dist[mergedDF_full$Seq==i])
}
#annotate if there is ANY essential gene in 150 bp neighborhood
mergedDF_full$essential_any =
apply(mergedDF_full, 1, function(pp){
orfs150bp <- unlist(strsplit(as.character(pp[33]), split="|", fixed=T))
essential_any = any(orfs150bp %in% as.character(essentialORFs))
return(essential_any)
})
#annotate if intentionally targeted ORF is essential
mergedDF_full$essential_target =
apply(mergedDF_full, 1, function(pp){
targeted_orfs <- as.character(pp[10])
essential_any = any(targeted_orfs %in% as.character(essentialORFs))
return(essential_any)
})
#The gRNAs designed to target GAT4 are further than 1kp to their target TSS and likely to not target any gene effectively and do not have significant FCs.
#The wt sequence is treated as a guide sequence (intact NotI site as non-functional gRNA control).
mergedDF_full[is.na(mergedDF_full$Nearest_Guide_to_TSS),]
##
mergedDF_full <- mergedDF_full[order(mergedDF_full$Seq),]
return(mergedDF_full)
})
return(allTabsList)
}
Run function for libraries
tab_TF <- fitGuideModel(counts = Counts_TF, annTable = ann_TF)
tab_Kin <- fitGuideModel(counts = Counts_Kin, annTable = ann_Kin)
tab_ThirtyFC <- rbind(tab_TF[[1]], tab_Kin[[1]])
#keep only one instance of gRNA sequences with multiple targets (the designed target in gene column). Multiple targets of each gRNA are still annotated in target150bpGenes/ORFs columns.
tab_ThirtyFC_singleGuide <- tab_ThirtyFC[tab_ThirtyFC$designedTarget=="designed_target",]
#The gRNAs designed to target GAT4 are further than 1kp to their target TSS and likely to not target any gene effectively and do not have significant FCs.
#Can be used as control guides
tab_ThirtyFC_singleGuide[!is.na(tab_ThirtyFC_singleGuide$Nearest_TSS_ORF),]
tab_ThirtyFC_singleGuide_noNA <- tab_ThirtyFC_singleGuide[!is.na(tab_ThirtyFC_singleGuide$Seq),]
#The wt sequence is treated as a guide sequence (intact NotI site as non-functional gRNA control).
#This is not an actual gRNA and is omitted here:
tab_ThirtyFC <- tab_ThirtyFC_singleGuide_noNA[-grep("wt", tab_ThirtyFC_singleGuide_noNA$Seq),]
#convert factors
tabs_List = list("tab_ThirtyFC"=tab_ThirtyFC)
tabsData <- lapply(tabs_List, function(i){
i$ORF <- as.character(i$ORF)
i$Gene <- as.character(i$Gene)
i$Identity <- as.character(i$Identity)
i$Chromosome <- as.character(i$Chromosome)
})
tab_ThirtyFC <- tabs_List[[1]]
##Genes that have more than 1 gRNA targeting another gene in 150bp neighborhood:
annotateComplexLoci <- function(df){
#all bidirectional genes based on gRNAs that target the designed TSS and potential other TSSs in 150bp distance
bidirectional_Genes_any = unique(df$targetGenes150bpDistance[df$targetORFs150bpDistanceNumber>=2])
#bidirectional genes covered by at least 2 gRNAs: 55 gene loci
bidirectional_Genes = names(which(table(df$targetGenes150bpDistance[df$targetGenes150bpDistance %in% bidirectional_Genes_any]) >=2))
#regulation table
regTab = table(df[ , c("Gene","targetGenes150bpDistance")])
regDF = as.data.frame(regTab)
regDF$Gene = as.character(regDF$Gene)
regDF$targetGenes150bpDistance = as.character(regDF$targetGenes150bpDistance)
regDF$Freq = as.numeric(regDF$Freq)
regDF = regDF[regDF$Freq>=2, ]
#overview on bidirectional genes
GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS = unique(regDF[regDF$targetGenes150bpDistance %in% bidirectional_Genes, ])
#overview on genes in complex loci with more than 2 TSS in neighborhood
#tabulating on targeted Gene
table(GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS$Gene)
complexLoci = table(GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS$Gene)[table(GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS$Gene)>1]
GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS[GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS$Gene %in% names(complexLoci),]
##Annotate complex loci
#if multiple loci are formed from one common one, only take the largest locus (the one that includes all potentially targeted genes)
ddf = GenesWithMoreThanOneGuidePotentiallyTargetingOtherTSS %>% group_by(Gene) %>% do({
this = .
tgLength = sapply(this$targetGenes150bpDistance, function(u){length(strsplit(u, "|", fixed=T)[[1]])})
#get index of longest gene loci per gene target, if multiple, and use only the longest one as this one has all combinations of the whole locus
longest = this$targetGenes150bpDistance[which(tgLength == max(tgLength))]
Freq = sum(this$Freq)
output = data.frame(longest, Freq, stringsAsFactors = F)
output
})
#Replacing genes by the locus if multiple genes in the locus are targeted to account for multiple gene targets nearby the targeted region
df$GeneLocus_on_GeneLvL = as.character(df$Gene)
df$GeneLocus_on_GeneLvL[df$Gene %in% ddf$Gene] = sapply(df$Gene[df$Gene %in% ddf$Gene], function(pp){
geneReplacement = ddf$longest[ddf$Gene == pp]
geneReplacement
})
#Replacing essentiality for aggregated essentiality
df$essential_any_aggregate = as.character(df$essential_any)
essential_GeneLoci = unique(df$GeneLocus_on_GeneLvL[df$essential_any]) #34 loci
df$essential_any_aggregate[df$GeneLocus_on_GeneLvL %in% essential_GeneLoci] = TRUE
#list guides that are twice in table due to multiple targets
doubleSeqs=names(table(df$Seq)[table(df$Seq)>=2])
#get index of guides that are not Nearest_Guide_to_TSS to get rid of them
doubleSeqIndex = df$Seq %in% doubleSeqs & df$Nearest_Guide_to_TSS==F
df = df[!doubleSeqIndex, ]
#call output
df
}
tab_ThirtyFC = annotateComplexLoci(df=tab_ThirtyFC)
tab_ThirtyFC$essential_any_aggregate = as.logical(tab_ThirtyFC$essential_any_aggregate)
Defining significant gRNAs in preparation for defining gene significance
#Define gRNAs as significant if they exceed an absolute log2FC of 1 with an adjusted p-value (BH FDR) below 0.05 (parameters set in function).
#Define genes as significantly influenced if a gene is targeted by at least 2 significant gRNAs (see above) and the mean gene-level log2FC has an FDR below 0.05
callSig <- function(df, logFCcutoff){
df$positiveFC = df$logFC>=logFCcutoff & df$FDR<=0.05
df$negativeFC = df$logFC<=(-logFCcutoff) & df$FDR<=0.05
sigGenesPositive = names(which(table(df$ORF[df$positiveFC]) >=2))
sigGenesNegative = names(which(table(df$ORF[df$negativeFC]) >=2))
genesWithGuideCrossingPositiveDir = names(which(table(df$ORF[df$positiveFC]) >=2))
genesWithGuideCrossingNegativeDir = names(which(table(df$ORF[df$negativeFC]) >=2))
sigGenesPositive<-sigGenesPositive[!sigGenesPositive%in%genesWithGuideCrossingNegativeDir]
sigGenesNegative<-sigGenesNegative[!sigGenesNegative%in%genesWithGuideCrossingPositiveDir]
c(sigGenesPositive, sigGenesNegative)
}
dfs_list <- list(tab_ThirtyFC)
#apply callSig function to guides
loop_List <- lapply(dfs_list, function(u) {
sgenes <- callSig(df=u, logFCcutoff = 1)
u$sig <- u$ORF %in% sgenes
})
tab_ThirtyFC$sig <- unlist(loop_List[1])
#guide lvl
qplot(x=tab_ThirtyFC$logCPM, y=tab_ThirtyFC$logFC, col=factor(tab_ThirtyFC$FDR<0.05)) + theme_classic()
#colnames ORF and gene specify the designed target ORF and gene, not the actual targets:
colnames(tab_ThirtyFC)[10:17] = paste(colnames(tab_ThirtyFC)[10:17], "_targeted", sep="")
#remove columns that were used for merging metadata
tab_ThirtyFC = tab_ThirtyFC[, -c(2,8)]
#gRNA target distribution
table(tab_ThirtyFC$targetORFs150bpDistanceNumber)
1 2 3 4
1345 186 5 2
###write to file:
#write.table(tab_ThirtyFC, file="./GuideScore_Thirty.txt", quote=F, sep="\t", row.names = F)
plotTable_guides <- function(tab){
for (i in 1:length(rownames(tab))){
if ( (-log10( tab$FDR[i] )) <= 80){
tab$plot_fdr[i] = (-log10( tab$FDR[i] ))
tab$plot_pch[i] = 20
}
if ( (-log10( tab$FDR[i] )) > 80){
tab$plot_fdr[i] = 80
tab$plot_pch[i] = 17
}
if (tab$logFC[i] > (-15) | tab$logFC[i] < 5 ){
tab$logFC_trim[i] = tab$logFC[i]
}
if (tab$logFC[i] < (-15)){
tab$logFC_trim[i] = (-15)
tab$plot_pch[i] = 17
}
if (tab$logFC[i] > 5){
tab$logFC_trim[i] = 5
tab$plot_pch[i] = 17
}
if (tab$FDR[i] <= 0.05){
tab$FDR_col[i] <- alpha("grey4", 0.2)
if (tab$logFC[i] <= (-1) | tab$logFC[i] >= 1){
tab$FDR_col[i] <- alpha("chartreuse4", 0.2)
}
}
else{tab$FDR_col[i] <- alpha("darkred", 0.2)
}
}
return(tab)
}
#generate tables with values for plotting:
tab_ThirtyFC <- plotTable_guides(tab=tab_ThirtyFC)
volcanoPlot <- function(tbl){
plot(tbl$logFC_trim, tbl$plot_fdr, pch=tbl$plot_pch, col=tbl$FDR_col,
xlab = "log2 barcode fold change", ylab = "-log10 FDR", cex=1.4, cex.axis=1.4, cex.lab=1.4, xlim = c(-15.01,5.01), ylim = c(0,80))
legend(-16, 24, c("significant gRNA", "abs. log2FC<1", "FDR>0.05"),
col=c(alpha("chartreuse4", 0.2), alpha("grey4", 0.2), alpha("darkred", 0.2)),
pch=20, border="black", cex=0.75, bty = "n")
abline(h=(-log10(0.05)), v=c(-1,1), col = alpha("blue", 0.2), lwd=1, lty=2)
text(-12, 3, labels="FDR=0.05", cex = 0.75, col = alpha("blue", 0.5))
}
#pdf('../Volcano_Plots_gRNA_fold-changes/Volcano_gRNA_30degC.pdf', width=5, height=5)
volcanoPlot(tbl=tab_ThirtyFC)
#dev.off()
Compute gene log2FCs without prior calculation of gRNA log2FCs:
#edgeR pipeline to compute FDRs on on gene level (using geometric mean of raw read counts per gene)
fitGeneModel <- function(counts, annTable){
#compute offsets used for guide level normalization for usage in the gene model:
Group_guideLvL <- annTable$grp
y_guideLvL = DGEList(counts, group=Group_guideLvL)
y_guideLvL = calcNormFactors(y_guideLvL)
offsets_guideLvL <- getOffset(y_guideLvL)
###########
#Similar pipeline as for individual gRNAs above:
#merge annotations1
fullAnno$mergeCol <- sapply(fullAnno$mergeCol, function(u){ sub("late", "", u)})
counts$rname = rownames(counts)
counts$mergeCol <- sapply(counts$rname, function(u){ paste( unlist(strsplit(u, ":"))[c(2, 3, 4) ], collapse = ":") })
counts$mergeCol <- sapply(counts$mergeCol, function(u){ paste( unlist(strsplit(u, "late-"))[1]) })
counts$mergeCol <- sapply(counts$mergeCol, function(u){ sub("-noMode-noGuideCtr", "", u) })
#annotate aimed strand and genomic positions
mergedDF <- merge(counts, fullAnno, by="mergeCol")
#exclude wt. This value is only meaningful at the guide level, if at all:
mergedDF <- mergedDF[-grep("wt", mergedDF$Gene),]
###########################
#aggregate on gene level
mergedDF$Gene_ORF <- paste(mergedDF$Gene, mergedDF$ORF, sep="_")
counts_annotated <- mergedDF[,c(2,3,4,5,16)]
geneMatrix <- matrix(nrow= length(unique(counts_annotated$Gene_ORF)), ncol = (length(colnames(counts_annotated))-1) )
rownames(geneMatrix) <- unique(counts_annotated$Gene_ORF, sep="_")
colnames(geneMatrix) <- colnames(counts_annotated)[1:4]
geneMatrix <- data.frame(geneMatrix)
#transform guide RNA barcode read counts into natural log scale, compute mean of all log scale counts that target the same gene, transform back with natural exponential function.
#this procedure yields the geometric mean of a gRNA barcodes with similar gene feature.
for (i in rownames(geneMatrix)){
geneMatrix[rownames(geneMatrix)==i, ] <- apply(counts_annotated[counts_annotated$Gene_ORF==i, 1:4], 2, FUN=function(reads){
geomMean <- round( exp( mean(log1p(reads) ))) #,na.rm=T)) )
return(geomMean)})
}
Group <- annTable$grp
#generate DGEList and perform likelihood ratio test on gene level
y = DGEList(geneMatrix, group=Group)
#Use offsets of guideLvL!
y = calcNormFactors(y)
y$offset= offsets_guideLvL
plotMDS(y)
design <- model.matrix(~0+grp+replicate , data=annTable)
yDisp <- estimateDisp(y, design, robust=TRUE)
plotBCV(yDisp)
#plotSmear(yDisp)
fit = glmFit(yDisp, design, robust=T)
#test for ATc effect at 30degC
lrt = glmLRT(fit, contrast = c(-1, 1, 0))
tab_ThirtyFC_geneLVL = topTags((lrt),n=nrow(yDisp))$table
hist(tab_ThirtyFC_geneLVL$logFC, breaks=50)
hist(tab_ThirtyFC_geneLVL$FDR, breaks=50)
plotMD(lrt)
abline(h=c(-1,1), col="blue")
tab_ThirtyFC_geneLVL$Gene_ORF <- rownames(tab_ThirtyFC_geneLVL)
tabSelection = list(tab_ThirtyFC_geneLVL)
return(tabSelection)
}
tab_TF_geneLVL <- fitGeneModel(counts = Counts_TF, annTable = ann_TF)
tab_Kin_geneLVL <- fitGeneModel(counts = Counts_Kin, annTable = ann_Kin)
Thirty_geneLVL_geomMean <- rbind(tab_TF_geneLVL[[1]], tab_Kin_geneLVL[[1]])
#Replacing genes by the locus if multiple genes in the locus are targeted to account for multiple gene targets nearby the targeted region
geneLocusAnnotate <- function(geneLevelTable, guideLevelTable){
#create cloumn with gene and orf info
geneLevelTable$Gene = sapply( geneLevelTable$Gene_ORF, function(spp){
unlist(strsplit(spp, split="_", fixed = T))[1]} )
geneLevelTable$GeneLocus_on_GeneLvL = as.character(geneLevelTable$Gene)
#ddf table for annotation of genes in bidirectional promoter configuration and complex regions
ddf = data.frame("Gene_targeted" = guideLevelTable$Gene_targeted, "GeneLocus_on_GeneLvL" = guideLevelTable$GeneLocus_on_GeneLvL, stringsAsFactors = FALSE)
ddf = unique(ddf)
ddf$Gene_targeted = as.character(ddf$Gene_targeted)
ddf$GeneLocus_on_GeneLvL = as.character(ddf$GeneLocus_on_GeneLvL)
geneLevelTable$GeneLocus_on_GeneLvL[geneLevelTable$Gene %in% ddf$Gene_targeted] = sapply(geneLevelTable$Gene[geneLevelTable$Gene %in% ddf$Gene_targeted], function(pp){
geneReplacement = ddf$GeneLocus_on_GeneLvL[ddf$Gene_targeted == pp]
geneReplacement
})
#call output
geneLevelTable[order(rownames(geneLevelTable)), ]
}
Thirty_geneLVL_geomMean = geneLocusAnnotate(geneLevelTable=Thirty_geneLVL_geomMean, guideLevelTable=tab_ThirtyFC)
head(Thirty_geneLVL_geomMean)
#write.table(Thirty_geneLVL_geomMean,file="./Thirty_geneLVL_geomMean.txt",quote=F,sep="\t", row.names = F)
#all gene combinations
allGenes <- as.character(unique(tab_ThirtyFC$GeneLocus_on_GeneLvL)) #290 single genes and gene combinations/loci targeted
#Of the 290 targeted genes, 34 genes/gene loci are essential (11.7 %).
essentialGenes <- as.character(unique(tab_ThirtyFC$GeneLocus_on_GeneLvL[tab_ThirtyFC$essential_any == TRUE]))
#173 guide RNAs target regions with at least one essential gene in 150 bp distance:
essentialGuides <- as.character(unique(tab_ThirtyFC$Seq[tab_ThirtyFC$essential_any == TRUE]))
#gene lvl
qplot(x=Thirty_geneLVL_geomMean$logCPM, y=Thirty_geneLVL_geomMean$logFC, col=factor(Thirty_geneLVL_geomMean$FDR<0.05)) + theme_classic()
cols <- c("FDR<=0.01" = "dodgerblue4", "FDR<=0.05" = "dodgerblue1", "FDR>=0.05" = "grey40", "FDR>=0.05" = "grey40")
qplot(logCPM, logFC, data = Thirty_geneLVL_geomMean, colour = FDR, main = "log2 fold changes of genes \n(geometric mean of all gRNAs targeting one gene)") +
scale_colour_gradientn(colors = cols, values=c(0, 0.01, 0.05, 0.5), breaks=c(0, 0.01, 0.05, 0.5))
plotTable_geneLvL <- function(tab){
for (i in 1:length(rownames(tab))){
if ( (-log10( tab$FDR[i] )) <= 80){
tab$plot_fdr[i] = (-log10( tab$FDR[i] ))
tab$plot_pch[i] = 20
}
if ( (-log10( tab$FDR[i] )) > 80){
tab$plot_fdr[i] = 80
tab$plot_pch[i] = 17
}
if (tab$logFC[i] > (-5) | tab$logFC[i] < 2 ){
tab$logFC_trim[i] = tab$logFC[i]
}
if (tab$logFC[i] < (-5)){
tab$logFC_trim[i] = (-5)
tab$plot_pch[i] = 17
}
if (tab$logFC[i] > 2){
tab$logFC_trim[i] = 2
tab$plot_pch[i] = 17
}
if (tab$FDR[i] <= 0.05){
tab$FDR_col[i] <- alpha("grey4", 0.2)
if (tab$logFC[i] <= (-1) | tab$logFC[i] >= 1){
tab$FDR_col[i] <- alpha("chartreuse4", 0.2)
}
}
else{tab$FDR_col[i] <- alpha("darkred", 0.2)
}
}
return(tab)
}
volcanoPlot_geneLVL <- function(tbl){
plot(tbl$logFC_trim, tbl$plot_fdr, pch=tbl$plot_pch, col=tbl$FDR_col,
xlab = "log2 gene fold change", ylab = "-log10 FDR", cex=1.4, cex.axis=1.4, cex.lab=1.4, ylim = c(0,80), xlim = c(-5,2))
legend(-5.3, 28, c("gene with abs. log2FC>=1", "abs. log2FC<1", "FDR>0.05"),
col=c(alpha("darkred", 0.2), alpha("grey4", 0.2), alpha("chartreuse4", 0.2)),
pch=20, border="black", cex=0.85, bty = "n")
abline(h=(-log10(0.05)), v=c(-1,1), col = alpha("blue", 0.2), lwd=1, lty=2)
text(-4.4, 3, labels="FDR=0.05", cex = 0.8, col=alpha("blue", 0.2))
}
volcanoPlot_geneLVL(tbl=plotTable_geneLvL(Thirty_geneLVL_geomMean))
#Mean of guides as gene score, including significant gene annotation for genes:
GeneScore_Thirty <- aggregate(tab_ThirtyFC$logFC, list(tab_ThirtyFC$Gene_targeted, tab_ThirtyFC$ORF_targeted, tab_ThirtyFC$sig, tab_ThirtyFC$GeneLocus_on_GeneLvL, tab_ThirtyFC$essential_any_aggregate, tab_ThirtyFC$Identity_targeted, tab_ThirtyFC$Chromosome_targeted), mean)
colnames(GeneScore_Thirty) <- c("gene", "ORF", "sig", "GeneLocus_on_GeneLvL", "essential", "Identity", "Chromosome", "mean.score")
#Calculate median gene scores
GeneScore_Thirty$median.score <- aggregate(tab_ThirtyFC$logFC, list(tab_ThirtyFC$Gene_targeted, tab_ThirtyFC$ORF_targeted, tab_ThirtyFC$sig, tab_ThirtyFC$GeneLocus_on_GeneLvL, tab_ThirtyFC$essential_any_aggregate, tab_ThirtyFC$Identity_targeted, tab_ThirtyFC$Chromosome_targeted), median)[,8]
#sort by gene name column
GeneScore_Thirty <- GeneScore_Thirty[order(GeneScore_Thirty$gene),]
GeneScore_Thirty$gene = as.character(GeneScore_Thirty$gene)
#calculate gene score based on one single most extreme guide (or zz most extreme guides)
zz = 1
function_MaxGuides <- function(guide_tbl){
#use by function to group the guide score table by gene
grouped_tbl <- by(guide_tbl, as.character(guide_tbl$Gene_targeted), function(x){
#sort the gene subtables by decreasing absolute logFC
guideSort_by_absFC <- x[order(abs(x$logFC), decreasing = T),]
GeneScore <- mean(guideSort_by_absFC$logFC[1:zz])
GeneScore
})
return(as.numeric(grouped_tbl))
}
GeneScore_Thirty$maxGuide.score <- function_MaxGuides(guide_tbl=tab_ThirtyFC)
#calculate gene score based on 2 most extreme guides
zz = 2
GeneScore_Thirty$maxTwoGuides.score <- function_MaxGuides(guide_tbl=tab_ThirtyFC)
#calculate gene score based on 3 most extreme guides
zz = 3
GeneScore_Thirty$maxThreeGuides.score <- function_MaxGuides(guide_tbl=tab_ThirtyFC)
#calculate gene score based on 4 most extreme guides
zz = 4
GeneScore_Thirty$maxFourGuides.score <- function_MaxGuides(guide_tbl=tab_ThirtyFC)
#calculate gene score based on 5 most extreme guides
zz = 5
GeneScore_Thirty$maxFiveGuides.score <- function_MaxGuides(guide_tbl=tab_ThirtyFC)
#exemplary plots
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxGuide.score)
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$median.score)
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxTwoGuides.score)
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxThreeGuides.score)
heatscatter(GeneScore_Thirty$median.score, GeneScore_Thirty$maxTwoGuides.score)
#34 of the 290 genes are essential:
table(GeneScore_Thirty$essential)
FALSE TRUE
256 34
#exemplary plots with essential genes
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxGuide.score)
points(GeneScore_Thirty$mean.score[GeneScore_Thirty$essential], GeneScore_Thirty$maxGuide.score[GeneScore_Thirty$essential], col="green", pch=23)
abline(h=0, col=alpha("black", 0.25))
abline(v=0, col=alpha("black", 0.25))
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$median.score)
points(GeneScore_Thirty$mean.score[GeneScore_Thirty$essential], GeneScore_Thirty$median.score[GeneScore_Thirty$essential], col="green", pch=23)
abline(h=0, col=alpha("black", 0.25))
abline(v=0, col=alpha("black", 0.25))
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxTwoGuides.score)
points(GeneScore_Thirty$mean.score[GeneScore_Thirty$essential], GeneScore_Thirty$maxTwoGuides.score[GeneScore_Thirty$essential], col="green", pch=23)
abline(h=0, col=alpha("black", 0.25))
abline(v=0, col=alpha("black", 0.25))
heatscatter(GeneScore_Thirty$mean.score, GeneScore_Thirty$maxThreeGuides.score)
points(GeneScore_Thirty$mean.score[GeneScore_Thirty$essential], GeneScore_Thirty$maxThreeGuides.score[GeneScore_Thirty$essential], col="green", pch=23)
abline(h=0, col=alpha("black", 0.25))
abline(v=0, col=alpha("black", 0.25))
heatscatter(GeneScore_Thirty$median.score, GeneScore_Thirty$maxTwoGuides.score)
points(GeneScore_Thirty$median.score[GeneScore_Thirty$essential], GeneScore_Thirty$maxTwoGuides.score[GeneScore_Thirty$essential], col="green", pch=23)
abline(h=0, col=alpha("black", 0.25))
abline(v=0, col=alpha("black", 0.25))
df_list <- list(df1=GeneScore_Thirty)
loop_dfList <- lapply(df_list, function(df) {
df$z_mean.score <- ( df$mean.score - mean(df$mean.score) )/sd(df$mean.score)
df$z_median.score <- ( df$median.score - mean(df$median.score) )/sd(df$median.score)
df$z_maxGuide.score <- ( df$maxGuide.score - mean(na.omit(df$maxGuide.score)) )/sd(na.omit(df$maxGuide.score))
df$z_maxTwoGuides.score <- ( df$maxTwoGuides.score - mean(na.omit(df$maxTwoGuides.score)) )/sd(na.omit(df$maxTwoGuides.score))
df$z_maxThreeGuides.score <- ( df$maxThreeGuides.score - mean(na.omit(df$maxThreeGuides.score)) )/sd(na.omit(df$maxThreeGuides.score))
df$z_maxFourGuides.score <- ( df$maxFourGuides.score - mean(na.omit(df$maxFourGuides.score)) )/sd(na.omit(df$maxFourGuides.score))
#order by gene locus
df[order(df$GeneLocus_on_GeneLvL), ]
return(df)
})
GeneScore_Thirty = loop_dfList$df1
#append geometric mean logFCs and FDR on gene level from edgeR pipeline with geometric mean:
Thirty_geneLVL_geomMean = Thirty_geneLVL_geomMean[order(Thirty_geneLVL_geomMean$Gene), ]
GeneScore_Thirty$geomMean_logFC <- Thirty_geneLVL_geomMean$logFC
GeneScore_Thirty$geomMean_FDR <- Thirty_geneLVL_geomMean$FDR
#Compare geneLVL_geometric_Mean of guide RNAs with mean of gRNA logFCs:
spearmanR = paste("r = ", as.character( round( cor(x=GeneScore_Thirty$mean.score, y=GeneScore_Thirty$geomMean_logFC, method='spearman'), digits=3) ) )
ggplot(GeneScore_Thirty) +
geom_point(aes(x=mean.score, y=geomMean_logFC, colour=essential), shape=19, size=2) +
annotate("text", x = -1.5, y = 0.5, color="black", label=spearmanR) +
scale_x_continuous(breaks=c(-10,-5, -2,-1,0,1), limits = c(-10, 1)) +
scale_y_continuous(breaks=c(-10,-5, -2,-1,0,1), limits = c(-10, 1)) +
scale_color_manual(values = c( alpha("chartreuse3", .1), alpha("orange3", .8)) ) +
theme_classic() +
theme(axis.text=element_text(size=15, colour = "black"),
axis.title = element_text(size = 12, colour = "black"),
axis.ticks = element_line(size = 0.8, colour = "black"),
axis.ticks.length=unit(0.2,"cm"),
legend.position="none"
#legend.title=element_blank(), legend.position="top"
) +
geom_hline(yintercept=0, linetype="dashed", color="#555555" ) +
geom_vline(xintercept=0, linetype="dashed", color="#555555" ) +
geom_abline(linetype="dashed", color="#555555") +
xlab("Geometric mean of log-scale barcode fold changes per gene") +
ylab("Log-scale fold change of the geometric mean \nof read counts per gene")
#Vertical Boxplot:
#select genes
selectionGeneList <- c("MSN2", "MSN4", "HSF1", "SCH9", "RSC3", "TEA1")
selectPlot <- tab_ThirtyFC[tab_ThirtyFC$Gene_targeted %in% selectionGeneList, ]
selectPlot$Gene_targeted <- factor(selectPlot$Gene_targeted, levels = selectionGeneList)
ggplot(data=selectPlot,
mapping = aes(x = GeneLocus_on_GeneLvL, y = as.numeric(logFC))) +
geom_boxplot(outlier.color = "transparent") +
geom_hline(yintercept=1, linetype="dashed", color="darkblue") +
geom_hline(yintercept=(-1), linetype="dashed", color="darkblue") +
geom_hline(yintercept=0, linetype="dashed", color="darkblue") +
# scale_y_continuous(breaks=seq(-3,1,1), limits = c(-3.5,1.5)) +
geom_jitter(aes(color=FDR), size = 3, width = 0.1) +
scale_color_gradientn(colors = c("black", "grey50", "grey60", "grey70", "grey75", "grey80", "grey85", "grey90"), na.value = "red", limits=c(0,1), breaks=c(0,0.1,0.2,0.5,1)) + #"darkblue")) +
theme_classic() +
theme(axis.text.x=element_text(angle=90, size=10, vjust = 0.5,
color="black"),
axis.text.y=element_text(angle=90, size=10, hjust = 0.5, color = "black"),
axis.title = element_text(size = rel(1.5), face="bold"),
axis.ticks = element_line(size = 0.9),
axis.ticks.length=unit(0.2,"cm")
) +
xlab("Target Gene") +
ylab("gRNA fold change") +
ggtitle(label="Repression effect on Growth at 30°C") +
geom_point(stat="summary", fun.y = "mean", colour="red3", size=5, pch=18)
Warning: Ignoring unknown parameters: fun.y
No summary function supplied, defaulting to `mean_se()`
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Catalina 10.15.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] LSD_4.1-0 scales_1.1.1 dplyr_1.0.0 pheatmap_1.0.12
[5] zoo_1.8-8 ggplot2_3.3.2 statmod_1.4.34 edgeR_3.26.8
[9] limma_3.40.6 stringr_1.4.0 knitr_1.29
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 RColorBrewer_1.1-2 compiler_3.6.1 pillar_1.4.6
[5] tools_3.6.1 digest_0.6.25 evaluate_0.14 lifecycle_0.2.0
[9] tibble_3.0.3 gtable_0.3.0 lattice_0.20-41 pkgconfig_2.0.3
[13] rlang_0.4.7 rstudioapi_0.11 yaml_2.2.1 xfun_0.15
[17] withr_2.2.0 generics_0.0.2 vctrs_0.3.1 locfit_1.5-9.4
[21] grid_3.6.1 tidyselect_1.1.0 glue_1.4.1 R6_2.4.1
[25] rmarkdown_2.3 farver_2.0.3 purrr_0.3.4 magrittr_1.5
[29] htmltools_0.5.0 ellipsis_0.3.1 colorspace_1.4-1 labeling_0.3
[33] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4